import re
from openpyxl import Workbook

# Create the output workbook and write the header row to its active sheet.
# Columns: rank, movie title, starring cast, release time, score.
HEADER_ROW = ["排名","电影名称","主演","上映时间","评分"]
wb = Workbook()
ws = wb.active
ws.append(HEADER_ROW)

# All patterns are compiled with DOTALL so "." also matches newlines, which
# lets a lazy group span a tag whose content is broken across several lines.

# Inner HTML of each <dd> element.
dd_pattern = re.compile(r'<dd>(.*?)</dd>', flags=re.DOTALL)

# Ranking number shown in the "board-index" element.
rank_pattern = re.compile(
    r'<i class="board-index board-index-\d*">(.*?)</i>', flags=re.DOTALL
)

# Movie title: the link text inside the "name" paragraph.
movie_pattern = re.compile(
    r'<p class="name"><a .*?>(.*?)</a>', flags=re.DOTALL
)

# Starring cast: text of the "star" paragraph.
star_pattern = re.compile(r'<p class="star">(.*?)</p>', flags=re.DOTALL)

# Release time: text of the "releasetime" paragraph.
time_pattern = re.compile(
    r'<p class="releasetime">(.*?)</p>', flags=re.DOTALL
)

# Score, which the markup splits into an "integer" and a "fraction" element.
score1_pattern = re.compile(r'<i class="integer">(.*?)</i>', flags=re.DOTALL)
score2_pattern = re.compile(r'<i class="fraction">(.*?)</i>', flags=re.DOTALL)

def _first_group(pattern, text, default=""):
    """Return capture group 1 of the first match of *pattern* in *text*.

    Returns *default* when the pattern does not match. This keeps a single
    entry with a missing field from raising IndexError and aborting the whole
    export before the workbook is saved.
    """
    match = pattern.search(text)
    return match.group(1) if match else default


# Parse the ten saved board pages (猫眼-1.html .. 猫眼-10.html) and append one
# worksheet row per <dd> entry; the workbook is saved once at the end.
for i in range(1, 11):
    with open(f"./data/猫眼/猫眼-{i}.html", "r", encoding="utf-8") as f:
        html = f.read()
    # The page is fully in memory, so parsing runs after the file is closed.
    for dd in dd_pattern.findall(html):
        # Ranking number.
        rank = _first_group(rank_pattern, dd)
        # Movie title.
        movie = _first_group(movie_pattern, dd)
        # Starring cast, with newlines and spaces removed.
        star = _first_group(star_pattern, dd).replace('\n', '').replace(' ', '')
        # Release time with its first five characters dropped
        # (presumably a fixed label prefix in the markup; confirm against
        # the saved pages).
        release_time = _first_group(time_pattern, dd)[5:]
        # The score is split across two elements; join them back together.
        score = _first_group(score1_pattern, dd) + _first_group(score2_pattern, dd)
        print(rank, movie, star, release_time, score)
        ws.append([rank, movie, star, release_time, score])
wb.save("data/猫眼/猫眼榜单.xlsx")


